1. Introducción y Contexto

1.1 Descripción del Dataset

El dataset hotel_bookings.csv contiene información detallada sobre reservas realizadas en dos hoteles portugueses: un City Hotel ubicado en Lisboa y un Resort Hotel en el Algarve. Los datos abarcan el período de 2015 a 2017 y proporcionan una visión comprehensiva del comportamiento de las reservas hoteleras.

# Dimensiones del dataset
cat("Dimensiones del dataset:\n")
## Dimensiones del dataset:
cat("- Observaciones:", nrow(df_csv), "\n")
## - Observaciones: 119390
cat("- Variables:", ncol(df_csv), "\n\n")
## - Variables: 32
# Primeras filas
kable(head(df_csv, 5), caption = "Primeras 5 observaciones del dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                font_size = 10) %>%
  scroll_box(width = "100%")
Primeras 5 observaciones del dataset
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
Resort Hotel 0 342 2015 July 27 1 0 0 2 0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit NULL NULL 0 Transient 0 0 0 Check-Out 2015-07-01
Resort Hotel 0 737 2015 July 27 1 0 0 2 0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit NULL NULL 0 Transient 0 0 0 Check-Out 2015-07-01
Resort Hotel 0 7 2015 July 27 1 0 1 1 0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit NULL NULL 0 Transient 75 0 0 Check-Out 2015-07-02
Resort Hotel 0 13 2015 July 27 1 0 1 1 0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304 NULL 0 Transient 75 0 0 Check-Out 2015-07-02
Resort Hotel 0 14 2015 July 27 1 0 2 2 0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240 NULL 0 Transient 98 0 1 Check-Out 2015-07-03

1.2 Variables del Dataset

El dataset contiene 32 variables que describen diferentes aspectos de cada reserva:

# Crear tabla descriptiva de variables
var_desc <- data.frame(
  Variable = names(df_csv),
  Tipo = sapply(df_csv, class),
  Valores_Unicos = sapply(df_csv, function(x) length(unique(x))),
  Valores_NA = sapply(df_csv, function(x) sum(is.na(x)))
)

datatable(var_desc, 
          options = list(pageLength = 10, scrollX = TRUE),
          caption = "Descripción de variables del dataset")

2. Limpieza y Preparación de Datos

2.1 Detección de Valores Faltantes

# Calcular porcentaje de valores faltantes
missing_data <- data.frame(
  Variable = names(df_csv),
  NA_Count = sapply(df_csv, function(x) sum(is.na(x))),
  NA_Percentage = round(sapply(df_csv, function(x) sum(is.na(x))/length(x)*100), 2)
) %>%
  filter(NA_Count > 0) %>%
  arrange(desc(NA_Count))

if(nrow(missing_data) > 0) {
  kable(missing_data, caption = "Variables con valores faltantes") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
  
  # Visualización interactiva
  p <- ggplot(missing_data, aes(x = reorder(Variable, NA_Percentage), 
                                 y = NA_Percentage)) +
    geom_bar(stat = "identity", fill = "#FF6B6B") +
    coord_flip() +
    labs(title = "Porcentaje de Valores Faltantes por Variable",
         x = "Variable", y = "Porcentaje (%)") +
    theme_minimal()
  
  ggplotly(p)
} else {
  cat("No se detectaron valores faltantes en el dataset.\n")
}

2.2 Detección de Valores Atípicos

# Variables numéricas para análisis de outliers
numeric_vars <- df_csv %>% 
  select_if(is.numeric) %>%
  select(lead_time, stays_in_weekend_nights, stays_in_week_nights, 
         adults, children, babies, adr)

# Función para detectar outliers
detect_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower <- Q1 - 1.5 * IQR
  upper <- Q3 + 1.5 * IQR
  sum(x < lower | x > upper, na.rm = TRUE)
}

outliers_summary <- data.frame(
  Variable = names(numeric_vars),
  Outliers = sapply(numeric_vars, detect_outliers),
  Porcentaje = round(sapply(numeric_vars, detect_outliers) / nrow(df_csv) * 100, 2)
)

kable(outliers_summary, caption = "Resumen de valores atípicos") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Resumen de valores atípicos
Variable Outliers Porcentaje
lead_time lead_time 3005 2.52
stays_in_weekend_nights stays_in_weekend_nights 265 0.22
stays_in_week_nights stays_in_week_nights 3354 2.81
adults adults 29710 24.88
children children 8590 7.19
babies babies 917 0.77
adr adr 3793 3.18
# Boxplots interactivos
plot_ly(data = df_csv, y = ~adr, type = "box", name = "ADR") %>%
  layout(title = "Distribución de ADR (Average Daily Rate) - Detección de Outliers",
         yaxis = list(title = "ADR (€)"))

2.3 Creación de Variables Derivadas

# Crear variables adicionales para el análisis
df_csv <- df_csv %>%
  mutate(
    # Total de noches
    total_nights = stays_in_weekend_nights + stays_in_week_nights,
    
    # Total de huéspedes
    total_guests = adults + children + babies,
    
    # Fecha de llegada (convertir mes a número primero)
    month_num = match(as.character(arrival_date_month), month.name),
    arrival_date = as.Date(paste(arrival_date_year, 
                                  month_num, 
                                  arrival_date_day_of_month, sep = "-"),
                           format = "%Y-%m-%d"),
    
    # Temporada
    season = case_when(
      arrival_date_month %in% c("December", "January", "February") ~ "Invierno",
      arrival_date_month %in% c("March", "April", "May") ~ "Primavera",
      arrival_date_month %in% c("June", "July", "August") ~ "Verano",
      TRUE ~ "Otoño"
    ),
    
    # Categoría de lead time
    lead_time_category = case_when(
      lead_time == 0 ~ "Mismo día",
      lead_time <= 7 ~ "1 semana",
      lead_time <= 30 ~ "1 mes",
      lead_time <= 90 ~ "3 meses",
      lead_time <= 180 ~ "6 meses",
      TRUE ~ "Más de 6 meses"
    ),
    
    # Cambio de habitación (convertir a character para comparar)
    room_changed = ifelse(as.character(reserved_room_type) != as.character(assigned_room_type), 1, 0),
    
    # Ingresos totales
    total_revenue = adr * total_nights
  ) %>%
  select(-month_num)  # Eliminar variable temporal

cat("Variables derivadas creadas exitosamente.\n")
## Variables derivadas creadas exitosamente.
cat("Nuevas dimensiones:", dim(df_csv), "\n")
## Nuevas dimensiones: 119390 39

3. Análisis Exploratorio de Datos (EDA)

3.1 Análisis de Cancelaciones

# Resumen de cancelaciones
cancel_summary <- df_csv %>%
  group_by(hotel, is_canceled) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(hotel) %>%
  mutate(percentage = round(count / sum(count) * 100, 2))

# Visualización interactiva
p1 <- ggplot(cancel_summary, aes(x = hotel, y = count, 
                                  fill = factor(is_canceled))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("#4ECDC4", "#FF6B6B"),
                    labels = c("No cancelada", "Cancelada")) +
  labs(title = "Tasa de Cancelación por Tipo de Hotel",
       x = "Tipo de Hotel", y = "Porcentaje", fill = "Estado") +
  theme_minimal()

ggplotly(p1)
# Tabla resumen
kable(cancel_summary %>% 
        pivot_wider(names_from = is_canceled, values_from = c(count, percentage)),
      caption = "Resumen de cancelaciones por hotel") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Resumen de cancelaciones por hotel
hotel count_0 count_1 percentage_0 percentage_1
City Hotel 46228 33102 58.27 41.73
Resort Hotel 28938 11122 72.24 27.76

3.1.1 Factores que Influyen en las Cancelaciones

# Lead time vs cancelación
p2 <- df_csv %>%
  group_by(lead_time_category, is_canceled) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(lead_time_category) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  filter(is_canceled == 1) %>%
  ggplot(aes(x = reorder(lead_time_category, percentage), 
             y = percentage, fill = percentage)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "#FFE66D", high = "#FF6B6B") +
  coord_flip() +
  labs(title = "Tasa de Cancelación según Lead Time",
       x = "Categoría de Lead Time", y = "% Cancelaciones") +
  theme_minimal() +
  theme(legend.position = "none")

ggplotly(p2)
# Deposit type vs cancelación
p3 <- df_csv %>%
  group_by(deposit_type, is_canceled) %>%
  summarise(count = n(), .groups = "drop") %>%
  ggplot(aes(x = deposit_type, y = count, fill = factor(is_canceled))) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#4ECDC4", "#FF6B6B"),
                    labels = c("No cancelada", "Cancelada")) +
  labs(title = "Cancelaciones según Tipo de Depósito",
       x = "Tipo de Depósito", y = "Número de Reservas", fill = "Estado") +
  theme_minimal()

ggplotly(p3)

3.2 Análisis Temporal

# Reservas por mes y año
monthly_bookings <- df_csv %>%
  filter(!is.na(arrival_date)) %>%
  mutate(year_month = format(arrival_date, "%Y-%m")) %>%
  group_by(year_month, hotel) %>%
  summarise(count = n(), .groups = "drop")

p4 <- ggplot(monthly_bookings, aes(x = year_month, y = count, 
                                    color = hotel, group = hotel)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("#FF6B6B", "#4ECDC4")) +
  labs(title = "Evolución Temporal de Reservas por Hotel",
       x = "Mes-Año", y = "Número de Reservas", color = "Hotel") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p4)
# Estacionalidad
seasonal_data <- df_csv %>%
  group_by(season, hotel) %>%
  summarise(
    total_bookings = n(),
    avg_adr = mean(adr, na.rm = TRUE),
    cancellation_rate = mean(is_canceled) * 100,
    .groups = "drop"
  )

p5 <- plot_ly(seasonal_data, x = ~season, y = ~total_bookings, 
              color = ~hotel, type = "bar") %>%
  layout(title = "Reservas por Temporada y Hotel",
         xaxis = list(title = "Temporada"),
         yaxis = list(title = "Número de Reservas"),
         barmode = "group")

p5

3.3 Análisis de Ingresos (ADR)

# Distribución de ADR
p6 <- plot_ly(df_csv %>% filter(adr > 0 & adr < 500), 
              x = ~adr, color = ~hotel, type = "histogram",
              colors = c("#FF6B6B", "#4ECDC4")) %>%
  layout(title = "Distribución de ADR por Hotel",
         xaxis = list(title = "ADR (€)"),
         yaxis = list(title = "Frecuencia"),
         barmode = "overlay")

p6
# ADR promedio por mes
adr_monthly <- df_csv %>%
  filter(adr > 0, !is.na(arrival_date)) %>%
  mutate(month = format(arrival_date, "%B")) %>%
  group_by(month, hotel) %>%
  summarise(avg_adr = mean(adr, na.rm = TRUE), .groups = "drop")

# Ordenar meses
month_order <- c("January", "February", "March", "April", "May", "June",
                 "July", "August", "September", "October", "November", "December")
adr_monthly$month <- factor(adr_monthly$month, levels = month_order)

p7 <- ggplot(adr_monthly, aes(x = month, y = avg_adr, 
                               fill = hotel, group = hotel)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#FF6B6B", "#4ECDC4")) +
  labs(title = "ADR Promedio por Mes",
       x = "Mes", y = "ADR Promedio (€)", fill = "Hotel") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p7)

3.4 Análisis Geográfico

# Top 15 países
top_countries <- df_csv %>%
  filter(country != "NULL") %>%
  group_by(country) %>%
  summarise(
    total_bookings = n(),
    cancellation_rate = mean(is_canceled) * 100,
    avg_adr = mean(adr, na.rm = TRUE)
  ) %>%
  arrange(desc(total_bookings)) %>%
  head(15)

p8 <- plot_ly(top_countries, x = ~reorder(country, total_bookings), 
              y = ~total_bookings, type = "bar",
              marker = list(color = ~cancellation_rate,
                           colorscale = "Reds",
                           showscale = TRUE,
                           colorbar = list(title = "% Cancel."))) %>%
  layout(title = "Top 15 Países por Número de Reservas",
         xaxis = list(title = "País"),
         yaxis = list(title = "Número de Reservas"))

p8
# Tabla de países
datatable(top_countries, 
          options = list(pageLength = 15, scrollX = TRUE),
          caption = "Top 15 países por reservas") %>%
  formatRound(columns = c("cancellation_rate", "avg_adr"), digits = 2)

3.5 Análisis de Segmentos de Mercado

# Distribución de segmentos
market_data <- df_csv %>%
  group_by(market_segment, hotel) %>%
  summarise(
    count = n(),
    avg_adr = mean(adr, na.rm = TRUE),
    cancellation_rate = mean(is_canceled) * 100,
    .groups = "drop"
  )

p9 <- plot_ly(market_data, x = ~market_segment, y = ~count, 
              color = ~hotel, type = "bar") %>%
  layout(title = "Distribución de Reservas por Segmento de Mercado",
         xaxis = list(title = "Segmento"),
         yaxis = list(title = "Número de Reservas"),
         barmode = "stack")

p9
# Tabla resumen por segmento
kable(market_data %>% 
        arrange(desc(count)), 
      caption = "Resumen por segmento de mercado",
      digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Resumen por segmento de mercado
market_segment hotel count avg_adr cancellation_rate
Online TA City Hotel 38748 118.92 37.40
Online TA Resort Hotel 17729 113.43 35.24
Offline TA/TO City Hotel 16747 93.02 42.83
Groups City Hotel 13975 84.92 68.86
Offline TA/TO Resort Hotel 7472 74.66 15.23
Direct Resort Hotel 6513 111.67 13.48
Direct City Hotel 6093 119.48 17.33
Groups Resort Hotel 5836 66.45 42.39
Corporate City Hotel 2986 83.12 21.47
Corporate Resort Hotel 2309 51.56 15.20
Complementary City Hotel 542 2.60 11.81
Aviation City Hotel 237 100.14 21.94
Complementary Resort Hotel 201 3.66 16.42
Undefined City Hotel 2 15.00 100.00

4. Insights Clave y Hallazgos

4.1 Principales Descubrimientos

# Calcular métricas clave
insights <- list(
  total_bookings = nrow(df_csv),
  cancellation_rate = mean(df_csv$is_canceled) * 100,
  avg_lead_time = mean(df_csv$lead_time),
  avg_adr = mean(df_csv$adr, na.rm = TRUE),
  most_common_country = names(sort(table(df_csv$country), decreasing = TRUE))[1],
  peak_month = names(sort(table(df_csv$arrival_date_month), decreasing = TRUE))[1]
)

# Crear tabla de insights
insights_df <- data.frame(
  Métrica = c("Total de Reservas", "Tasa de Cancelación (%)", 
              "Lead Time Promedio (días)", "ADR Promedio (€)",
              "País Más Común", "Mes Pico"),
  Valor = c(
    format(insights$total_bookings, big.mark = ","),
    round(insights$cancellation_rate, 2),
    round(insights$avg_lead_time, 0),
    round(insights$avg_adr, 2),
    insights$most_common_country,
    insights$peak_month
  )
)

kable(insights_df, caption = "Métricas Clave del Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), 
                full_width = FALSE)
Métricas Clave del Dataset
Métrica Valor
Total de Reservas 119,390
Tasa de Cancelación (%) 37.04
Lead Time Promedio (días) 104
ADR Promedio (€) 101.83
País Más Común PRT
Mes Pico August

4.2 Patrones Identificados

Cancelaciones:

  • Las reservas con lead time superior a 6 meses presentan las tasas de cancelación más altas
  • Los depósitos no reembolsables tienen tasas de cancelación significativamente menores
  • El City Hotel tiene mayor tasa de cancelación que el Resort Hotel

Estacionalidad:

  • La demanda presenta picos claros en verano (julio-agosto)
  • El ADR promedio también aumenta durante los meses de verano
  • El Resort Hotel muestra mayor estacionalidad que el City Hotel

Ingresos:

  • Existe gran variabilidad en el ADR, con outliers significativos
  • Los meses de verano y temporadas altas presentan ADR más elevados
  • El segmento “Online TA” genera el mayor volumen de reservas

5. Conclusiones y Recomendaciones

5.1 Conclusiones

El análisis del dataset de reservas hoteleras revela patrones complejos en el comportamiento de los clientes y oportunidades claras para optimizar la gestión hotelera:

  1. Gestión de Cancelaciones: Las cancelaciones representan un desafío significativo, especialmente para reservas con lead times prolongados. La implementación de políticas de depósito podría mitigar este riesgo.

  2. Optimización de Precios: La estacionalidad marcada sugiere oportunidades para implementar estrategias de revenue management más sofisticadas.

  3. Segmentación de Mercado: La diversidad de segmentos y orígenes geográficos requiere estrategias de marketing diferenciadas.

5.2 Recomendaciones

  • Implementar modelos predictivos para identificar reservas con alto riesgo de cancelación
  • Desarrollar políticas de pricing dinámico basadas en patrones estacionales
  • Focalizar esfuerzos de marketing en países con alta tasa de conversión
  • Optimizar la gestión de inventario durante temporadas pico

6. Referencias y Recursos

  • Dataset original: Hotel Booking Demand Datasets (Antonio, Almeida & Nunes, 2019)
  • Análisis realizado con R versión R version 4.5.1 (2025-06-13 ucrt)
  • Paquetes utilizados: tidyverse, plotly, ggplot2, y otros

Nota: Este documento ha sido generado de forma reproducible. Todas las visualizaciones son interactivas y pueden explorarse en el formato HTML.

cat("Información de la sesión:\n")
## Información de la sesión:
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Spanish_Spain.utf8  LC_CTYPE=Spanish_Spain.utf8   
## [3] LC_MONETARY=Spanish_Spain.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=Spanish_Spain.utf8    
## 
## time zone: Europe/Madrid
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5      viridisLite_0.4.2  gridExtra_2.3      scales_1.4.0      
##  [5] kableExtra_1.4.0   knitr_1.50         DT_0.34.0          plotly_4.11.0     
##  [9] lubridate_1.9.4    forcats_1.0.1      stringr_1.6.0      dplyr_1.1.4       
## [13] purrr_1.2.0        readr_2.1.6        tidyr_1.3.1        tibble_3.3.0      
## [17] tidyverse_2.0.0    ggstatsplot_0.13.3 fitdistrplus_1.2-4 survival_3.8-3    
## [21] MASS_7.3-65        ggmosaic_0.4.0     ggplot2_4.0.1     
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6           xfun_0.54              bslib_0.9.0           
##  [4] bayestestR_0.17.0      htmlwidgets_1.6.4      insight_1.4.2         
##  [7] ggrepel_0.9.6          lattice_0.22-7         tzdb_0.5.0            
## [10] paletteer_1.6.0        crosstalk_1.2.2        vctrs_0.6.5           
## [13] tools_4.5.1            generics_0.1.4         datawizard_1.3.0      
## [16] pkgconfig_2.0.3        Matrix_1.7-3           data.table_1.17.8     
## [19] RColorBrewer_1.1-3     correlation_0.8.8      S7_0.2.1              
## [22] RcppParallel_5.1.11-1  lifecycle_1.0.4        compiler_4.5.1        
## [25] farver_2.1.2           textshaping_1.0.4      codetools_0.2-20      
## [28] htmltools_0.5.8.1      sass_0.4.10            yaml_2.3.10           
## [31] lazyeval_0.2.2         pillar_1.11.1          jquerylib_0.1.4       
## [34] cachem_1.1.0           statsExpressions_1.7.1 tidyselect_1.2.1      
## [37] digest_0.6.38          stringi_1.8.7          rematch2_2.1.2        
## [40] labeling_0.4.3         splines_4.5.1          fastmap_1.2.0         
## [43] grid_4.5.1             cli_3.6.5              magrittr_2.0.4        
## [46] patchwork_1.3.2        productplots_0.1.2     withr_3.0.2           
## [49] timechange_0.3.0       rmarkdown_2.30         httr_1.4.7            
## [52] hms_1.1.4              evaluate_1.0.5         parameters_0.28.2     
## [55] rstantools_2.5.0       rlang_1.1.6            Rcpp_1.1.0            
## [58] zeallot_0.2.0          glue_1.8.0             xml2_1.4.1            
## [61] svglite_2.2.2          rstudioapi_0.17.1      jsonlite_2.0.0        
## [64] effectsize_1.0.1       R6_2.6.1               plyr_1.8.9            
## [67] systemfonts_1.3.1